In [1]:
%matplotlib inline
Code for building the models
Author: Jimmy Charité
Email: jimmy.charite@gmail.com
Experimenting with tensorflow
In [2]:
import os
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import Image
from IPython.core.display import HTML
import tensorflow as tf
In [3]:
# Step up one directory so the relative paths used below resolve.
# NOTE(review): this is relative to the current directory, so re-running
# the cell moves up one more level each time.
retval = os.chdir(os.pardir)
In [4]:
# Load the cleaned dataset. The path is relative, so it depends on the
# working directory set by the os.chdir("..") cell above.
# NOTE(review): unpickling can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
clean_data=pd.read_pickle('./clean_data/clean_data.pkl')
In [5]:
clean_data.head()
Out[5]:
In [6]:
# 'helpful' goes first, followed by every column from position 9 onward
# (presumably the feature columns -- TODO confirm against the data schema).
kept_cols = ['helpful'] + list(clean_data.columns[9:])
In [7]:
# Seed and hold-out fraction passed to train_test_split below.
my_rand_state, test_size = 0, 0.25
In [8]:
from sklearn.model_selection import train_test_split
In [9]:
# Features: every kept column except the first one ('helpful').
# .values is used instead of DataFrame.as_matrix(), which was deprecated in
# pandas 0.23 and removed in pandas 1.0; both return the same ndarray.
X = clean_data[kept_cols].iloc[:, 1:].values
# Target: the first kept column ('helpful') as a plain Python list.
y = clean_data[kept_cols].iloc[:, 0].tolist()
In [10]:
# Hold out `test_size` of the rows for evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=my_rand_state,
)
In [11]:
# One real-valued column spanning every feature in X. X.shape[1] gives the
# column count directly and, unlike len(X[0,:]), does not fail when X has
# no rows.
# NOTE(review): tf.contrib only exists in TensorFlow 1.x.
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=X.shape[1])]
In [12]:
# Three hidden layers (200, 100, 50 units); checkpoints and summaries are
# written under model_dir.
dnn_clf = tf.contrib.learn.DNNClassifier(
    feature_columns=feature_columns,
    hidden_units=[200, 100, 50],
    model_dir='./other_output/tf_model',
)
In [13]:
from sklearn.preprocessing import StandardScaler
# Shared scaler instance; PassData.scale() fits it on the data it holds.
std_scale=StandardScaler()
In [16]:
class PassData(object):
    '''
    Callable object that can be initialized and
    used to pass data to tensorflow as an ``input_fn``.

    Parameters
    ----------
    X : array-like
        Features to feed to the estimator.
    y : array-like
        Labels that go with ``X``.
    '''

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def scale(self):
        '''
        Standardize the stored features.

        Fits the notebook-level ``std_scale`` on ``self.X`` and replaces
        ``self.X`` with the transformed values.
        '''
        self.X = std_scale.fit_transform(self.X, self.y)

    def __call__(self):
        '''
        Return ``(features, labels)`` as tensorflow constants.
        '''
        # Read from the instance, not from the notebook-level X and y:
        # otherwise the data given to __init__ and the result of scale()
        # would never reach the estimator.
        return tf.constant(self.X), tf.constant(self.y)
In [17]:
# Feed only the training split to the estimator so that X_test / y_test
# stay unseen and remain valid for evaluation.
train_data=PassData(X_train,y_train)
In [ ]:
# Standardize the features held by train_data (this fits the shared std_scale).
train_data.scale()
In [ ]:
# Train for 1000 steps; the estimator calls train_data() to obtain the
# (features, labels) tensors.
dnn_clf.fit(input_fn=train_data,steps=1000)
In [ ]:
from sklearn.metrics import roc_curve, auc
In [ ]:
def _roc_points(clf, X_eval, y_eval):
    '''
    Compute the ROC curve and its area for one fitted classifier.

    The score passed to roc_curve is the second column of
    ``clf.predict_proba(X_eval)``; the thresholds it returns are discarded.

    Returns
    -------
    (fpr, tpr, roc_auc)
    '''
    fpr, tpr, _ = roc_curve(y_eval, clf.predict_proba(X_eval)[:, 1])
    return fpr, tpr, auc(fpr, tpr)


# NOTE(review): nb_clf_est_b, qda_clf_est_b, log_clf_est_b and rf_clf_est_b
# are not defined anywhere in the visible part of this notebook, so this
# cell fails on a fresh kernel unless they are created elsewhere.
nb_fpr, nb_tpr, nb_roc_auc = _roc_points(nb_clf_est_b, X_test, y_test)
qda_fpr, qda_tpr, qda_roc_auc = _roc_points(qda_clf_est_b, X_test, y_test)
log_fpr, log_tpr, log_roc_auc = _roc_points(log_clf_est_b, X_test, y_test)
rf_fpr, rf_tpr, rf_roc_auc = _roc_points(rf_clf_est_b, X_test, y_test)
In [ ]:
# One entry per model: (false-positive rates, true-positive rates, colour, legend text).
roc_curves = [
    (nb_fpr, nb_tpr, 'cyan', 'NB (area = %0.2f)' % nb_roc_auc),
    (qda_fpr, qda_tpr, 'indigo', 'QDA (area = %0.2f)' % qda_roc_auc),
    (log_fpr, log_tpr, 'seagreen', 'LOG (area = %0.2f)' % log_roc_auc),
    (rf_fpr, rf_tpr, 'blue', 'RF (area = %0.2f)' % rf_roc_auc),
]
for curve_fpr, curve_tpr, curve_color, curve_label in roc_curves:
    plt.plot(curve_fpr, curve_tpr, color=curve_color, linestyle='--',
             label=curve_label, lw=2)

# Diagonal reference line for a classifier that guesses at random.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k',
         label='Luck')

# Pad both axes slightly so curves touching 0 or 1 stay visible.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves of Basic Models Using BOW & Macro-Text Stats')
plt.legend(loc="lower right")

# Save before show(): the figure is written to disk first, then displayed.
plt.savefig('./plots/ROC_Basic_BOW_MERGED.png', bbox_inches='tight')
plt.show()